#pragma once
#include "stm32f4xx_hal.h"
#include <cmath>

/*
 * Build-configuration switches for the matrix helpers declared in this file.
 *
 * Every switch is guarded with #ifndef so that a value already supplied by
 * the toolchain, the build system, or a header included above is preserved
 * rather than incompatibly redefined.
 *
 * NOTE(review): __FPU_PRESENT and __FPU_USED are normally provided by the
 * CMSIS device/core headers with a numeric value and tested with
 * "#if (X == 1)"; the fallbacks below therefore expand to 1U instead of
 * being empty. Confirm against the device header in use.
 */
#ifndef __FPU_USED
#define __FPU_USED 1U
#endif

#ifndef __FPU_PRESENT
#define __FPU_PRESENT 1U
#endif

#ifndef ARM_MATH_CM4
#define ARM_MATH_CM4
#endif

/*
 * NOTE(review): __CC_ARM is ordinarily predefined by the Arm Compiler itself.
 * Forcing it on another toolchain may select compiler-specific code paths in
 * CMSIS headers - confirm this definition is really wanted.
 */
#ifndef __CC_ARM
#define __CC_ARM
#endif

#ifndef ARM_MATH_MATRIX_CHECK
#define ARM_MATH_MATRIX_CHECK
#endif

#ifndef ARM_MATH_LOOPUNROLL
#define ARM_MATH_LOOPUNROLL
#endif

#ifndef ARM_MATH_ROUNDING
#define ARM_MATH_ROUNDING
#endif

/**
 * Address of element (ROW, COL) of the row-major matrix pointed to by A.
 *
 * A must point to an object with `pData` and `numCols` members. The whole
 * expansion is parenthesized so the result can be used safely inside larger
 * expressions (for example with a postfix operator such as `[]`).
 * A is evaluated twice; avoid arguments with side effects.
 */
#define ELEM(A,ROW,COL) (&((A)->pData[(A)->numCols * (ROW) + (COL)]))

/**
 * Swap rows i and j of the row-major float matrix pointed to by A, in place,
 * for columns COL .. numCols-1 only (columns before COL are left untouched).
 *
 * Every macro argument is parenthesized, so expressions such as `r + 1` can
 * be passed for COL, i and j. Internal locals carry a `swap_` prefix and a
 * trailing underscore so they do not capture identifiers used in the
 * caller's argument expressions.
 *
 * A and COL are evaluated more than once; avoid arguments with side effects.
 * The expansion is a brace-enclosed block, as before.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                         \
{                                                        \
  int32_t swap_w_;                                       \
  float swap_tmp_;                                       \
  float *swap_row_i_ = (A)->pData;                       \
  float *swap_row_j_ = (A)->pData;                       \
  const int32_t swap_cols_ = (A)->numCols;               \
  const int32_t swap_nb_ = swap_cols_ - (COL);           \
                                                         \
  swap_row_i_ += (i) * swap_cols_ + (COL);               \
  swap_row_j_ += (j) * swap_cols_ + (COL);               \
                                                         \
  for (swap_w_ = 0; swap_w_ < swap_nb_; swap_w_++)       \
  {                                                      \
     swap_tmp_ = *swap_row_i_;                           \
     *swap_row_i_++ = *swap_row_j_;                      \
     *swap_row_j_++ = swap_tmp_;                         \
  }                                                      \
}

/**
 * Multiply row i of the row-major float matrix pointed to by A by v, in
 * place, for columns COL .. numCols-1 only.
 *
 * Every macro argument is parenthesized, so expressions such as `c + 1` can
 * be passed for COL, i and v. Internal locals carry a `scale_` prefix and a
 * trailing underscore so they do not capture identifiers used in the
 * caller's argument expressions.
 *
 * A and COL are evaluated more than once and v is evaluated once per scaled
 * element; avoid arguments with side effects.
 * The expansion is a brace-enclosed block, as before.
 */
#define SCALE_ROW_F32(A,COL,v,i)                         \
{                                                        \
  int32_t scale_w_;                                      \
  float *scale_row_ = (A)->pData;                        \
  const int32_t scale_cols_ = (A)->numCols;              \
  const int32_t scale_nb_ = scale_cols_ - (COL);         \
                                                         \
  scale_row_ += (i) * scale_cols_ + (COL);               \
                                                         \
  for (scale_w_ = 0; scale_w_ < scale_nb_; scale_w_++)   \
  {                                                      \
     *scale_row_++ *= (v);                               \
  }                                                      \
}

/**
 * Multiply-accumulate one row into another:
 *   A[i][c] += v * B[j][c]   for c = COL .. numCols-1
 *
 * A and B point to row-major float matrices. The column count of A is used
 * as the row stride for both A and B, so B must share that stride.
 *
 * Every macro argument is parenthesized, so expressions such as `r - 1` or
 * `x + y` can be passed for i, j and v. Internal locals carry a `mac_`
 * prefix and a trailing underscore so they do not capture identifiers used
 * in the caller's argument expressions.
 *
 * A and COL are evaluated more than once and v is evaluated once per
 * element; avoid arguments with side effects.
 * The expansion is a brace-enclosed block, as before.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                       \
{                                                        \
  int32_t mac_w_;                                        \
  float *mac_dst_ = (A)->pData;                          \
  float *mac_src_ = (B)->pData;                          \
  const int32_t mac_cols_ = (A)->numCols;                \
  const int32_t mac_nb_ = mac_cols_ - (COL);             \
                                                         \
  mac_dst_ = mac_dst_ + (i) * mac_cols_ + (COL);         \
  mac_src_ = mac_src_ + (j) * mac_cols_ + (COL);         \
                                                         \
  for (mac_w_ = 0; mac_w_ < mac_nb_; mac_w_++)           \
  {                                                      \
     *mac_dst_++ += (v) * *mac_src_++;                   \
  }                                                      \
}

/**
 * Multiply-subtract one row from another:
 *   A[i][c] -= v * B[j][c]   for c = COL .. numCols-1
 *
 * A and B point to row-major float matrices. The column count of A is used
 * as the row stride for both A and B, so B must share that stride.
 *
 * Every macro argument is parenthesized, so expressions such as `r - 1` or
 * `x + y` can be passed for i, j and v. Internal locals carry a `mas_`
 * prefix and a trailing underscore so they do not capture identifiers used
 * in the caller's argument expressions.
 *
 * A and COL are evaluated more than once and v is evaluated once per
 * element; avoid arguments with side effects.
 * The expansion is a brace-enclosed block, as before.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                       \
{                                                        \
  int32_t mas_w_;                                        \
  float *mas_dst_ = (A)->pData;                          \
  float *mas_src_ = (B)->pData;                          \
  const int32_t mas_cols_ = (A)->numCols;                \
  const int32_t mas_nb_ = mas_cols_ - (COL);             \
                                                         \
  mas_dst_ = mas_dst_ + (i) * mas_cols_ + (COL);         \
  mas_src_ = mas_src_ + (j) * mas_cols_ + (COL);         \
                                                         \
  for (mas_w_ = 0; mas_w_ < mas_nb_; mas_w_++)           \
  {                                                      \
     *mas_dst_++ -= (v) * *mas_src_++;                   \
  }                                                      \
}

/**
 * Descriptor for a single-precision floating-point matrix.
 * The helper macros in this file index pData as
 * pData[row * numCols + col].
 */
typedef struct
{
    /** Number of rows of the matrix. */
    uint16_t numRows;
    /** Number of columns of the matrix. */
    uint16_t numCols;
    /** Pointer to the matrix element data. */
    float *pData;
} matrix_instance_f32;

/**
 * Status codes returned by the matrix routines declared in this file.
 * Zero means success; every error code is negative.
 */
typedef enum
{
    /** No error. */
    MATH_SUCCESS = 0,
    /** One or more arguments are incorrect. */
    MATH_ARGUMENT_ERROR = -1,
    /** Length of data buffer is incorrect. */
    MATH_LENGTH_ERROR = -2,
    /** Size of matrices is not compatible with the operation. */
    MATH_SIZE_MISMATCH = -3,
    /** Not-a-number (NaN) or infinity is generated. */
    MATH_NANINF = -4,
    /** Generated by matrix inversion if the input matrix is singular and cannot be inverted. */
    MATH_SINGULAR = -5,
    /** Test failed. */
    MATH_TEST_FAILURE = -6
} caculate_status;

/*
 * Matrix API. The implementations are not part of this header; the
 * behavioral notes below are inferred from names and signatures and should
 * be confirmed against the definitions.
 */

/**
 * Initialize the matrix descriptor S.
 * @param S        descriptor to fill in
 * @param nRows    number of rows
 * @param nColumns number of columns
 * @param pData    element storage - presumably stored in S by reference, not copied (TODO confirm)
 */
void mat_init_f32(matrix_instance_f32* S, uint16_t nRows, uint16_t nColumns, float* pData);

/**
 * Copy a block of floats from pSrc to pDst.
 * Declared with plain `float` to match the rest of this header, which never
 * declares `float32_t`.
 * @param pSrc      source buffer
 * @param pDst      destination buffer
 * @param blockSize presumably the number of float elements to copy (TODO confirm)
 */
void mat_copy_f32(float* pSrc, float* pDst, uint32_t blockSize);

/**
 * Matrix addition; presumably pDst = pSrcA + pSrcB (TODO confirm).
 * @return a caculate_status code
 */
caculate_status mat_add_f32(const matrix_instance_f32* pSrcA, const matrix_instance_f32* pSrcB, matrix_instance_f32* pDst);

/**
 * Matrix subtraction; presumably pDst = pSrcA - pSrcB (TODO confirm).
 * @return a caculate_status code
 */
caculate_status mat_sub_f32(const matrix_instance_f32* pSrcA, const matrix_instance_f32* pSrcB, matrix_instance_f32* pDst);

/**
 * Matrix multiplication; presumably pDst = pSrcA * pSrcB (TODO confirm).
 * @return a caculate_status code
 */
caculate_status mat_mult_f32(const matrix_instance_f32* pSrcA, const matrix_instance_f32* pSrcB, matrix_instance_f32* pDst);

/**
 * Matrix transpose; presumably pDst = transpose(pSrc) (TODO confirm).
 * @return a caculate_status code
 */
caculate_status mat_trans_f32(const matrix_instance_f32* pSrc, matrix_instance_f32* pDst);

/**
 * Matrix inversion; presumably dst = inverse(src) (TODO confirm).
 * NOTE(review): although src is const-qualified, the row-operation macros in
 * this header write through pData; check whether the implementation modifies
 * the source matrix data.
 * @return a caculate_status code (MATH_SINGULAR is documented as the result for a singular input)
 */
caculate_status mat_inverse_f32(const matrix_instance_f32* src, matrix_instance_f32* dst);
